Outline

There are more and more bioconductor packages supporting single-cell data analysis. R Amezquita, A Lun, S Hicks, and R Gottardo wrote an integrated workflow, Orchestrating Single-Cell Analysis with Bioconductor, for single-cell data analysis and quality accessment. In this session, we will go through several important QC metrics which can’t be made with Seurat.

  • how to differentiate empty droplets?
  • how to estimate ambient RNA and remove them?
  • how to identify doublets?
  • any confounded effect?

load data and access empty drops


Load data from Cell Ranger Result

library(DropletUtils)
library(DropletTestFiles)
fname <- "file path to Cell Ranger results"
sce <- read10xCounts(fname, col.names = TRUE)
cellID <- colData(sce)$Barcode
cellID_sel <- sample(cellID, 1e+05, replace = FALSE)  # make subsets
sce_sub <- sce[, cellID_sel]

SingleCellExperiment Object

sce_sub
## class: SingleCellExperiment 
## dim: 27998 100000 
## metadata(1): Samples
## assays(1): counts
## rownames(27998): ENSMUSG00000051951 ENSMUSG00000089699 ...
##   ENSMUSG00000096730 ENSMUSG00000095742
## rowData names(2): ID Symbol
## colnames(100000): CGTGTCTTCGCATGAT-1 TTTATGCGTCGAACAG-1 ...
##   TTATGCTGTGTTTGTG-1 GCTGCGACACGTTGGC-1
## colData names(2): Sample Barcode
## reducedDimNames(0):
## altExpNames(0):
# cell information
colData(sce_sub)[1:2, ]
## DataFrame with 2 rows and 2 columns
##                                    Sample            Barcode
##                               <character>        <character>
## CGTGTCTTCGCATGAT-1 ~/Desktop/01_scSeq_t.. CGTGTCTTCGCATGAT-1
## TTTATGCGTCGAACAG-1 ~/Desktop/01_scSeq_t.. TTTATGCGTCGAACAG-1
# gene information
rowData(sce_sub)[1:2, ]
## DataFrame with 2 rows and 2 columns
##                                    ID      Symbol
##                           <character> <character>
## ENSMUSG00000051951 ENSMUSG00000051951        Xkr4
## ENSMUSG00000089699 ENSMUSG00000089699      Gm1992
sce <- sce_sub

Access UMI counts in each droplet

bcrank <- barcodeRanks(counts(sce))
bcrank[1:2, ]
## DataFrame with 2 rows and 3 columns
##                         rank     total    fitted
##                    <numeric> <numeric> <numeric>
## CGTGTCTTCGCATGAT-1   41200.5         1        NA
## TTTATGCGTCGAACAG-1   41200.5         1        NA

Knee plot

uniq <- !duplicated(bcrank$rank)
# 
plot(bcrank$rank[uniq], bcrank$total[uniq], log = "xy", xlab = "Rank", ylab = "Total UMI count", 
    cex.lab = 1.2)

abline(h = metadata(bcrank)$inflection, col = "darkgreen", lty = 2)
abline(h = metadata(bcrank)$knee, col = "dodgerblue", lty = 2)

legend("bottomleft", legend = c("Inflection", "Knee"), col = c("darkgreen", "dodgerblue"), 
    lty = 2, cex = 1.2)

Knee Plot

## Warning in xy.coords(x, y, xlabel, ylabel, log): 1 y value <= 0 omitted from
## logarithmic plot

Identify non-empyt droplets

  • limit: lowest counts per cell
  • test.ambient: Could be used to estimate embient RNA contamination
set.seed(100)
limit <- 100
e.out <- emptyDrops(counts(sce), lower = limit, test.ambient = TRUE)
# 
e.out
## DataFrame with 100000 rows and 5 columns
##                        Total     LogProb     PValue   Limited        FDR
##                    <integer>   <numeric>  <numeric> <logical>  <numeric>
## CGTGTCTTCGCATGAT-1         1    -5.34170 0.70912909     FALSE 0.95697174
## TTTATGCGTCGAACAG-1         1    -6.44503 0.50414959     FALSE 0.92603663
## CATGACACAAGTCTGT-1         0          NA         NA        NA         NA
## ACATACGCACCACCAG-1        58  -266.20222 0.01839816     FALSE 0.45640979
## CGTCCATAGCTGCAAG-1      1601 -2917.18710 0.00009999      TRUE 0.00507393
## ...                      ...         ...        ...       ...        ...
## GTGCTTCGTCACCCAG-1         0          NA         NA        NA         NA
## GGAAAGCCAAGGCTCC-1         1    -9.06371 0.20007999     FALSE   0.818381
## TGCCCTAGTAGCTCCG-1         1   -13.51757 0.00309969     FALSE   0.125927
## TTATGCTGTGTTTGTG-1         0          NA         NA        NA         NA
## GCTGCGACACGTTGGC-1         0          NA         NA        NA         NA

Identify non-empty droplets

# Testeed by FDR
summary(e.out$FDR <= 0.001)
##    Mode   FALSE    TRUE    NA's 
## logical   54010     591   45399
# Concordance by testing with FDR and limited
table(Sig = e.out$FDR <= 0.001, Limited = e.out$Limited)
##        Limited
## Sig     FALSE  TRUE
##   FALSE 53525   485
##   TRUE      1   590

distribution of significance of non-empty reads

hist(e.out$PValue[e.out$Total <= limit & e.out$Total > 0], xlab = "P-value", main = "", 
    col = "grey80")

Subset non-empty droplets

sce2 <- sce[, which(e.out$FDR <= 0.001)]

Normalization and clustering


Counts normalization

library(scran)
library(scuttle)
library(scater)
clusters <- quickCluster(sce2)
sce2 <- computeSumFactors(sce2, cluster = clusters)
sce2 <- logNormCounts(sce2)
sce2
## class: SingleCellExperiment 
## dim: 27998 591 
## metadata(1): Samples
## assays(2): counts logcounts
## rownames(27998): ENSMUSG00000051951 ENSMUSG00000089699 ...
##   ENSMUSG00000096730 ENSMUSG00000095742
## rowData names(2): ID Symbol
## colnames(591): GCTCCTAAGGCACATG-1 GTAACTGTCGGATGGA-1 ...
##   CACCAGGTCACCTTAT-1 AGGTCATCATGTAAGA-1
## colData names(3): Sample Barcode sizeFactor
## reducedDimNames(0):
## altExpNames(0):

Identify variable features

set.seed(1000)
# modeling variables
dec.pbmc <- modelGeneVarByPoisson(sce2)
# calcualte top features
top.pbmc <- getTopHVGs(dec.pbmc, prop = 0.1)

Make TSNE and UMAP plots

set.seed(1000)
# Evaluate PCs
sce2 <- denoisePCA(sce2, subset.row = top.pbmc, technical = dec.pbmc)
# make TSNE plot
sce2 <- runTSNE(sce2, dimred = "PCA")
# make UMAP plot
sce2 <- runUMAP(sce2, dimred = "PCA")

Graphic based clustering

g <- buildSNNGraph(sce2, k = 10, use.dimred = "PCA")
clust <- igraph::cluster_walktrap(g)$membership
colLabels(sce2) <- factor(clust)
# 
colData(sce2)
## DataFrame with 591 rows and 4 columns
##                                    Sample            Barcode sizeFactor
##                               <character>        <character>  <numeric>
## GCTCCTAAGGCACATG-1 ~/Desktop/01_scSeq_t.. GCTCCTAAGGCACATG-1   0.913482
## GTAACTGTCGGATGGA-1 ~/Desktop/01_scSeq_t.. GTAACTGTCGGATGGA-1   0.970723
## CGGACACCATTACGAC-1 ~/Desktop/01_scSeq_t.. CGGACACCATTACGAC-1   0.725295
## GCGAGAAAGCTACCGC-1 ~/Desktop/01_scSeq_t.. GCGAGAAAGCTACCGC-1   0.700158
## AAACGGGCAGATGGGT-1 ~/Desktop/01_scSeq_t.. AAACGGGCAGATGGGT-1   1.636871
## ...                                   ...                ...        ...
## GTGGGTCAGCACCGTC-1 ~/Desktop/01_scSeq_t.. GTGGGTCAGCACCGTC-1   1.026750
## CGCTATCAGTACGACG-1 ~/Desktop/01_scSeq_t.. CGCTATCAGTACGACG-1   0.677916
## GTACTTTTCCTTTCGG-1 ~/Desktop/01_scSeq_t.. GTACTTTTCCTTTCGG-1   0.978623
## CACCAGGTCACCTTAT-1 ~/Desktop/01_scSeq_t.. CACCAGGTCACCTTAT-1   1.130759
## AGGTCATCATGTAAGA-1 ~/Desktop/01_scSeq_t.. AGGTCATCATGTAAGA-1   1.689680
##                       label
##                    <factor>
## GCTCCTAAGGCACATG-1        2
## GTAACTGTCGGATGGA-1        2
## CGGACACCATTACGAC-1        2
## GCGAGAAAGCTACCGC-1        6
## AAACGGGCAGATGGGT-1        4
## ...                     ...
## GTGGGTCAGCACCGTC-1        6
## CGCTATCAGTACGACG-1        6
## GTACTTTTCCTTTCGG-1        1
## CACCAGGTCACCTTAT-1        7
## AGGTCATCATGTAAGA-1        4

UMAP plot

plotUMAP(sce2, colour_by = "label")

tSNE plot

plotTSNE(sce2, colour_by = "label")

Remove ambient RNA


Abmient RNA

  • cell-free RNAs contaminated in droplet
  • could be estimated by empty droplets
# extrat potential ambient RNA and thee estimated score
amb <- metadata(e.out)$ambient[, 1]
head(amb)
## ENSMUSG00000051951 ENSMUSG00000025902 ENSMUSG00000033845 ENSMUSG00000025903 
##       5.808442e-07       5.808442e-07       1.067188e-04       6.783426e-05 
## ENSMUSG00000033813 ENSMUSG00000002459 
##       5.876210e-05       5.808442e-07

Remove ambient RNA

library(scater)
stripped <- sce2[names(amb), ]
out <- removeAmbience(counts(stripped), ambient = amb, groups = colLabels(stripped))
## Integrate corrected counts
r counts(stripped, withDimnames = FALSE) <- out stripped <- logNormCounts(stripped)
## Before/After removement - Hemoglobin A1 (Hba-a1) as example - in most cases the Hbs are contaminated from residual RBCs
r ensmbl_id <- rowData(sce2)$ID[rowData(sce2)$Symbol == "Hba-a1"] plotExpression(sce2, x = "label", colour_by = "label", features = ensmbl_id) + ggtitle("Before") plotExpression(stripped, x = "label", colour_by = "label", features = ensmbl_id) + ggtitle("After")
## Before/After removement
## Before/After removement - Krt17 as example
r ensmbl_id <- rowData(sce2)$ID[rowData(sce2)$Symbol == "Krt17"] plotExpression(sce2, x = "label", colour_by = "label", features = ensmbl_id) + ggtitle("Before") plotExpression(stripped, x = "label", colour_by = "label", features = ensmbl_id) + ggtitle("After")
## Before/After removement
## Save result
r saveRDS(stripped, "scSeq_CTRL_sceSub_rmAmbRNA.rds")
# Remove doublets

Doublets

  • Doublets means two or more cells clumped in a single droplet. Thus, the read counts and genes detected in this droplet would be much higher than other droplets.
  • here, we demonstrate how to identify doublets by silmulation with scran

Normalization and clustering

dec <- modelGeneVar(stripped)
hvgs <- getTopHVGs(dec, n = 1000)
stripped <- runPCA(stripped, ncomponents = 10, subset_row = hvgs)
stripped <- runUMAP(stripped, dimred = "PCA")
g <- buildSNNGraph(stripped, k = 10, use.dimred = "PCA")
clust <- igraph::cluster_walktrap(g)$membership
colLabels(stripped) <- factor(clust)
plotUMAP(stripped, colour_by = "label")

Normalization and clustering

Estimate doublets

dbl.dens <- computeDoubletDensity(stripped, #subset.row=top.mam, 
    d=ncol(reducedDim(stripped)),subset.row=hvgs)
summary(dbl.dens)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.2908  0.7978  1.1442  1.2442  1.6010  4.1961
stripped$DoubletScore <- dbl.dens

Plot doublets scores ~ UMAP

plotUMAP(stripped, colour_by = "DoubletScore")

Plot doublets scores by cluster

plotColData(stripped, x = "label", y = "DoubletScore", colour_by = "label") + geom_hline(yintercept = quantile(colData(stripped)$DoubletScore, 
    0.95), lty = "dashed", color = "red")

- No clusters have significantly higher doublet scores than other clusters.No clusters would be removed. - Red dash line representd 95% quantile of doublet score. The cells with higher doublrt score than this cut-off would be removed.

Remove doublets

cut_off <- quantile(stripped$DoubletScore, 0.95)
stripped$isDoublet <- c("no", "yes")[factor(as.integer(stripped$DoubletScore >= cut_off), 
    levels = c(0, 1))]
table(stripped$isDoublet)
## 
##  no yes 
## 561  30
sce_clean <- stripped[stripped$isDoublet == "no", ]
saveRDS(sce_clean, "scSeq_CTRL_sceSub_rmAmbRNA_rmDoublet.rds")

QC plots after cleanrance


QC plots

  • mitochondrial content
  • Genes detected
  • Reads per cell
library(scater)
mtGene <- rowData(sce_clean)$ID[grepl(rowData(sce_clean)$Symbol, pattern = "mt-")]
is.mito <- names(sce_clean) %in% mtGene
sce_clean <- addPerCellQC(sce_clean, subsets = list(Mito = is.mito))

QC plots

  • violin plot of read counts, gene counts, and mitochondrial contents per cell
plotColData(sce_clean, x = "label", y = "sum", colour_by = "label") + ggtitle("read counts")

plotColData(sce_clean, x = "label", y = "detected", colour_by = "label") + ggtitle("gene counts")

plotColData(sce_clean, x = "label", y = "subsets_Mito_percent", colour_by = "label") + 
    ggtitle("mitocondrial content")

QC plots ~ comparison

  • mitochondrial contents vs read counts
  • gene counts vs read counts
plotColData(sce_clean, x = "sum", y = "subsets_Mito_percent", colour_by = "label") + 
    ggtitle("is.mito vs read counts")

plotColData(sce_clean, x = "sum", y = "detected", colour_by = "label") + ggtitle("gene counts vs read counts")

Estimate variance explaination

  • clustering (leable), mitochindrial content (subsets_Mito_percent), doublets (DoubletScore), read counts (sum), and gene counts (detected) were tested.
  • We would suppose “label” (clustering) would explain more variances than other controls.
vars <- getVarianceExplained(sce_clean, variables = c("DoubletScore", "label", "sum", 
    "detected", "subsets_Mito_percent"))
plotExplanatoryVariables(vars)
## Warning: Removed 2765 rows containing non-finite values (stat_density).

Exercise Time